import torch

from ppl_eval import ppl_metric
from transformers.models.llama import modeling_llama
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


def get_model(model_dir):
    """Load a causal language model and its slow tokenizer from a local directory.

    While the model is being instantiated, ``torch.nn.init.kaiming_uniform_``,
    ``uniform_`` and ``normal_`` are temporarily replaced with a no-op so that
    freshly constructed layers are not randomly initialized. The original
    initializers are restored before returning, even if loading fails, so
    the patch does not leak into the rest of the process.

    Args:
        model_dir: Path of the local checkpoint directory. Everything is
            loaded with ``local_files_only=True``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded with
        ``torch_dtype=torch.bfloat16``. The tokenizer is guaranteed to have
        ``bos_token``, ``eos_token``, ``pad_token`` and ``unk_token`` set.
    """
    def skip(tensor=None, *args, **kwargs):
        # No-op stand-in for the in-place initializers. Like the real
        # functions it hands the tensor back, so callers that use the return
        # value keep working.
        return tensor

    patched_names = ("kaiming_uniform_", "uniform_", "normal_")
    saved_inits = {name: getattr(torch.nn.init, name) for name in patched_names}
    for name in patched_names:
        setattr(torch.nn.init, name, skip)

    try:
        # NOTE(review): the config is loaded with trust_remote_code=False while
        # the model and tokenizer below use trust_remote_code=True. Confirm
        # which setting is intended and make the three calls consistent.
        config = AutoConfig.from_pretrained(model_dir, trust_remote_code=False, local_files_only=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            config=config,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            local_files_only=True,
        )
    finally:
        # Always undo the monkey-patch so modules created later are
        # initialized normally.
        for name, original in saved_inits.items():
            setattr(torch.nn.init, name, original)

    tokenizer = AutoTokenizer.from_pretrained(
        model_dir,
        trust_remote_code=True,
        local_files_only=True,
        use_fast=False,
    )

    # Fill in any special tokens the checkpoint's tokenizer does not define.
    if tokenizer.bos_token is None:
        tokenizer.bos_token = "<s>"
    if tokenizer.eos_token is None:
        tokenizer.eos_token = "</s>"
    # pad/unk fall back to EOS (which is guaranteed to be set at this point).
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if tokenizer.unk_token is None:
        tokenizer.unk_token = tokenizer.eos_token
    return model, tokenizer


if __name__ == "__main__":
    # Script entry point: load the local checkpoint, run the perplexity
    # evaluation on wikitext2, then report the layer-skip counters exposed by
    # the modeling_llama module.
    seq_len = 128  # used for model.seqlen and as the 4th ppl_metric argument
    eval_arg = 8  # 5th ppl_metric argument; presumably the batch size — confirm

    print('load model...')
    loaded = get_model("../llama2-13b-local")
    model = loaded[0]
    tokenizer = loaded[1]
    model.seqlen = seq_len
    model.eval()

    print('start evaluate...')
    gpu_model = model.cuda()
    ppl_metric(gpu_model, tokenizer, ['wikitext2'], seq_len, eval_arg)
    # Wait for outstanding GPU work before reading the counters.
    torch.cuda.synchronize()

    # NOTE(review): layer_num / skipped_layer_count are module-level attributes
    # of transformers.models.llama.modeling_llama; presumably provided by a
    # locally modified copy of that module — confirm.
    total_layers = modeling_llama.layer_num
    print(f"[LOG] layer_num: {total_layers}")
    skipped_layers = modeling_llama.skipped_layer_count
    print(f"[LOG] Skipped layers: {skipped_layers}")
    if not total_layers > 0:
        pass  # nothing was counted, so the ratio is undefined
    else:
        ratio = skipped_layers / total_layers
        print(f"[LOG] skipped ratio: {ratio}")